Now we run this a second time, on the second (b) feature table, which removed all epithets with fewer than 27 representative documents. The results are better: the overall F1 score for the decision tree is 0.44 and for the random forest 0.47; on the (a) table these were 0.33 and 0.40, respectively.
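As a reminder of how the (b) table differs from (a), here is a minimal sketch of the filtering step, assuming only the dataframe layout used below (an 'epithet' column labeling each document); the threshold of 27 comes from the text:

import os

import joblib

dataframe_bow = joblib.load(os.path.expanduser('~/cltk_data/user_data/tlg_bow_df.pickle'))
# Keep only epithets represented by at least 27 documents
counts = dataframe_bow['epithet'].value_counts()
keep = counts[counts >= 27].index
dataframe_bow = dataframe_bow[dataframe_bow['epithet'].isin(keep)]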
In [1]:
import os

import joblib
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
In [2]:
import datetime as dt

from sklearn import clone
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
In [3]:
# Load the bag-of-words feature table built in the previous step
fp_df = os.path.expanduser('~/cltk_data/user_data/tlg_bow_df.pickle')
dataframe_bow = joblib.load(fp_df)
In [4]:
# The target: each document's epithet label
Y = dataframe_bow['epithet']
In [5]:
# The features: everything except the label and identifier columns
X = dataframe_bow.drop(['epithet', 'id', 'author'], axis=1)
In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)
In [7]:
def scale_data(X_train, X_test, Y_train, Y_test):
    """Scale the feature vectors (preprocessing).

    Scaled data has zero mean and unit variance. We save the scaler
    for later use with testing/prediction data.
    """
    print('Scaling data ...')
    t0 = dt.datetime.utcnow()
    scaler = preprocessing.StandardScaler().fit(X_train)
    fp_scaler = os.path.expanduser('~/cltk_data/user_data/tlg_bow_scaler.pickle')
    joblib.dump(scaler, fp_scaler)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()
    return X_train_scaled, X_test_scaled, Y_train, Y_test
In [8]:
X_train_scaled, X_test_scaled, Y_train, Y_test = scale_data(X_train, X_test, Y_train, Y_test)
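Since the scaler is pickled above, later prediction data can be scaled with exactly the parameters learned on the training set. A minimal sketch, where X_new is a hypothetical matrix of new documents with the same feature columns as X:

import os

import joblib

# Reload the fitted scaler and apply it to new, unseen feature vectors
scaler = joblib.load(os.path.expanduser('~/cltk_data/user_data/tlg_bow_scaler.pickle'))
X_new_scaled = scaler.transform(X_new)  # X_new is hypothetical, not defined in this notebook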
In [9]:
def run_tree(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Run a decision tree with scikit-learn.

    This is where we define the model with pre-set parameters,
    then learn those parameters from our data.
    Experiment with 'max_depth'.
    """
    print('Defining and fitting models ...')
    t0 = dt.datetime.utcnow()
    dec_tree = DecisionTreeClassifier()
    dec_tree.fit(X_train_scaled, Y_train)
    fp_model_pickle = os.path.expanduser('~/cltk_data/user_data/tlg_bow_dt.pickle')
    joblib.dump(dec_tree, fp_model_pickle)
    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()
    Y_prediction_tree = dec_tree.predict(X_test_scaled)
    print('tree_predictions ', Y_prediction_tree)
    expected = Y_test
    print('actual_values ', expected)
    print()
    print('----Tree report--------------------------------')
    print(classification_report(expected, Y_prediction_tree))
In [10]:
run_tree(X_train_scaled, X_test_scaled, Y_train, Y_test)
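As the docstring suggests, 'max_depth' is the natural knob to turn here; a quick sketch of such an experiment (the depth values are illustrative, not tuned):

# Illustrative values only; None lets the tree grow until leaves are pure
for depth in (5, 10, 20, None):
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0)
    tree.fit(X_train_scaled, Y_train)
    print(depth, tree.score(X_test_scaled, Y_test))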
In [12]:
def run_random_forest(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Run a scikit-learn random forest.

    Experiment with 'n_estimators'.
    """
    print('Defining and fitting model ...')
    t0 = dt.datetime.utcnow()
    n_estimators = 30
    rf_model = RandomForestClassifier(n_estimators=n_estimators)
    # Train on a fresh clone of the model
    clf = clone(rf_model)
    clf.fit(X_train_scaled, Y_train)
    fp_model_pickle = os.path.expanduser('~/cltk_data/user_data/tlg_bow_random_forest.pickle')
    joblib.dump(clf, fp_model_pickle)
    print('Training accuracy:', clf.score(X_train_scaled, Y_train))
    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()
    Y_prediction = clf.predict(X_test_scaled)
    print('forest_predictions ', Y_prediction)
    expected = Y_test
    print('actual_values ', expected)
    print()
    print('----Random forest report--------------------------------')
    print(classification_report(expected, Y_prediction))
In [13]:
run_random_forest(X_train_scaled, X_test_scaled, Y_train, Y_test)
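Likewise, 'n_estimators' is the parameter to experiment with here; a sketch comparing a few forest sizes (values illustrative, not tuned):

# Illustrative values only; accuracy typically rises with diminishing returns
for n in (10, 30, 100):
    forest = RandomForestClassifier(n_estimators=n, random_state=0)
    forest.fit(X_train_scaled, Y_train)
    print(n, forest.score(X_test_scaled, Y_test))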
In [ ]:
def run_svc(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Run a linear SVC with scikit-learn."""
    # This is where we define the model with pre-set parameters,
    # then learn those parameters from our data
    print('Defining and fitting SVC model ...')
    t0 = dt.datetime.utcnow()
    svc = svm.LinearSVC(C=100.)
    svc.fit(X_train_scaled, Y_train)
    fp_model_pickle = os.path.expanduser('~/cltk_data/user_data/tlg_bow_svc.pickle')
    joblib.dump(svc, fp_model_pickle)
    print('... finished in {} secs.'.format(dt.datetime.utcnow() - t0))
    print()
    Y_prediction_svc = svc.predict(X_test_scaled)
    print('svc_predictions ', Y_prediction_svc)
    expected = Y_test
    print('actual_values ', expected)
    print()
    print('----SVC report--------------------------------')
    print(classification_report(expected, Y_prediction_svc))
In [ ]:
run_svc(X_train_scaled, X_test_scaled, Y_train, Y_test)
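The value C=100 above is a guess; a sketch of a small grid search over C, using scikit-learn's GridSearchCV on the scaled training set (the grid values are illustrative):

from sklearn.model_selection import GridSearchCV

# Score each candidate C by 3-fold cross-validation on the training set
grid = GridSearchCV(svm.LinearSVC(), {'C': [0.01, 1.0, 100.0]}, cv=3)
grid.fit(X_train_scaled, Y_train)
print(grid.best_params_, grid.score(X_test_scaled, Y_test))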
In [ ]:
def run_ada_boost(X_train_scaled, X_test_scaled, Y_train, Y_test):
    """Run a scikit-learn AdaBoost classifier over decision trees.

    For plotting see:
    http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_iris.html
    Experiment with 'n_estimators'.
    """
    n_estimators = 30
    ada_classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                        n_estimators=n_estimators)
    # Train on a fresh clone of the model
    clf = clone(ada_classifier)
    clf.fit(X_train_scaled, Y_train)
    fp_model_pickle = os.path.expanduser('~/cltk_data/user_data/tlg_bow_ada_boost.pickle')
    joblib.dump(clf, fp_model_pickle)
    print('Training accuracy:', clf.score(X_train_scaled, Y_train))
    Y_prediction = clf.predict(X_test_scaled)
    print('ada_boost_predictions ', Y_prediction)
    expected = Y_test
    print('actual_values ', expected)
    print()
    print('----AdaBoost report--------------------------------')
    print(classification_report(expected, Y_prediction))
In [ ]:
run_ada_boost(X_train_scaled, X_test_scaled, Y_train, Y_test)
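The overall F1 scores quoted at the top of this section are the weighted averages from these classification reports; they can also be computed directly. A sketch for the random forest model pickled above:

from sklearn.metrics import f1_score

clf = joblib.load(os.path.expanduser('~/cltk_data/user_data/tlg_bow_random_forest.pickle'))
print(f1_score(Y_test, clf.predict(X_test_scaled), average='weighted'))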